library(tidyverse)
replacing previous import ‘vctrs::data_frame’ by ‘tibble::data_frame’ when loading ‘dplyr’Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[37m── [1mAttaching packages[22m ─────────────────────────────────────────────────────────── tidyverse 1.3.0 ──[39m
[37m[32m✓[37m [34mggplot2[37m 3.3.2 [32m✓[37m [34mpurrr [37m 0.3.4
[32m✓[37m [34mtibble [37m 3.0.3 [32m✓[37m [34mdplyr [37m 1.0.0
[32m✓[37m [34mtidyr [37m 1.1.0 [32m✓[37m [34mstringr[37m 1.4.0
[32m✓[37m [34mreadr [37m 1.3.1 [32m✓[37m [34mforcats[37m 0.5.0[39m
package ‘ggplot2’ was built under R version 3.6.2package ‘tibble’ was built under R version 3.6.2package ‘tidyr’ was built under R version 3.6.2package ‘purrr’ was built under R version 3.6.2package ‘dplyr’ was built under R version 3.6.2[37m── [1mConflicts[22m ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[37m [34mdplyr[37m::[32mfilter()[37m masks [34mstats[37m::filter()
[31mx[37m [34mdplyr[37m::[32mlag()[37m masks [34mstats[37m::lag()[39m
house_data <- read_csv("data/kc_house_data.csv")
Parsed with column specification:
cols(
.default = col_double(),
id = [31mcol_character()[39m,
date = [34mcol_datetime(format = "")[39m
)
See spec(...) for full column specifications.
CODECLAN SOILUTION
houses <- read_csv("data/kc_house_data.csv")
Parsed with column specification:
cols(
.default = col_double(),
id = [31mcol_character()[39m,
date = [34mcol_datetime(format = "")[39m
)
See spec(...) for full column specifications.
glimpse(houses)
Rows: 21,613
Columns: 21
$ id [3m[38;5;246m<chr>[39m[23m "7129300520", "6414100192", "5631500400", "2487200875", "1954400510", "723…
$ date [3m[38;5;246m<dttm>[39m[23m 2014-10-13, 2014-12-09, 2015-02-25, 2014-12-09, 2015-02-18, 2014-05-12, 2…
$ price [3m[38;5;246m<dbl>[39m[23m 221900, 538000, 180000, 604000, 510000, 1225000, 257500, 291850, 229500, 3…
$ bedrooms [3m[38;5;246m<dbl>[39m[23m 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2, 3, 4, 3, 5, 2, 3,…
$ bathrooms [3m[38;5;246m<dbl>[39m[23m 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.50, 2.50, 1.00, 1.…
$ sqft_living [3m[38;5;246m<dbl>[39m[23m 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 1890, 3560, 1160, 143…
$ sqft_lot [3m[38;5;246m<dbl>[39m[23m 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470, 6560, 9796, 6000,…
$ floors [3m[38;5;246m<dbl>[39m[23m 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.5, 1.0, 1.5,…
$ waterfront [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ view [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 0,…
$ condition [3m[38;5;246m<dbl>[39m[23m 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 4,…
$ grade [3m[38;5;246m<dbl>[39m[23m 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 9, 8, 7, 8…
$ sqft_above [3m[38;5;246m<dbl>[39m[23m 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 1890, 1860, 860, 1430…
$ sqft_basement [3m[38;5;246m<dbl>[39m[23m 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, 0, 970, 0, 0, 0, 0…
$ yr_built [3m[38;5;246m<dbl>[39m[23m 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2003, 1965, 1942, 19…
$ yr_renovated [3m[38;5;246m<dbl>[39m[23m 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ zipcode [3m[38;5;246m<dbl>[39m[23m 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198, 98146, 98038, 9800…
$ lat [3m[38;5;246m<dbl>[39m[23m 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47.3097, 47.4095, 47…
$ long [3m[38;5;246m<dbl>[39m[23m -122.257, -122.319, -122.233, -122.393, -122.045, -122.005, -122.327, -122…
$ sqft_living15 [3m[38;5;246m<dbl>[39m[23m 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 2390, 2210, 1330, 17…
$ sqft_lot15 [3m[38;5;246m<dbl>[39m[23m 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, 7570, 8925, 6000, …
library(modelr)
house_data
Tidy up the data ready for regression:
You might like to think about removing some or all of date, id, sqft_living15, sqft_lot15 and zipcode (lat and long provide a better measure of location in any event).
house_data_tidy <- house_data %>%
select(-c("date", "id", "sqft_living15", "sqft_lot15", "zipcode"))
house_data_tidy
CODECLAN SOLUTION
# tidy up data. In particular treat condition and grade as factor, as they are
# ordinal categorical
houses_tidy <- houses %>%
select(-c("id", "date", "sqft_living15", "sqft_lot15", "zipcode")) %>%
mutate(waterfront = as.logical(waterfront)) %>%
mutate(renovated = yr_renovated != 0) %>%
select(-"yr_renovated") %>%
mutate(condition = as_factor(condition)) %>%
mutate(grade = as_factor(grade))
glimpse(houses_tidy)
Rows: 21,613
Columns: 16
$ price [3m[38;5;246m<dbl>[39m[23m 221900, 538000, 180000, 604000, 510000, 1225000, 257500, 291850, 229500, 3…
$ bedrooms [3m[38;5;246m<dbl>[39m[23m 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2, 3, 4, 3, 5, 2, 3,…
$ bathrooms [3m[38;5;246m<dbl>[39m[23m 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.50, 2.50, 1.00, 1.…
$ sqft_living [3m[38;5;246m<dbl>[39m[23m 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 1890, 3560, 1160, 143…
$ sqft_lot [3m[38;5;246m<dbl>[39m[23m 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470, 6560, 9796, 6000,…
$ floors [3m[38;5;246m<dbl>[39m[23m 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.5, 1.0, 1.5,…
$ waterfront [3m[38;5;246m<lgl>[39m[23m FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ view [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 0,…
$ condition [3m[38;5;246m<fct>[39m[23m 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 4,…
$ grade [3m[38;5;246m<fct>[39m[23m 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 9, 8, 7, 8…
$ sqft_above [3m[38;5;246m<dbl>[39m[23m 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 1890, 1860, 860, 1430…
$ sqft_basement [3m[38;5;246m<dbl>[39m[23m 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, 0, 970, 0, 0, 0, 0…
$ yr_built [3m[38;5;246m<dbl>[39m[23m 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2003, 1965, 1942, 19…
$ lat [3m[38;5;246m<dbl>[39m[23m 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47.3097, 47.4095, 47…
$ long [3m[38;5;246m<dbl>[39m[23m -122.257, -122.319, -122.233, -122.393, -122.045, -122.005, -122.327, -122…
$ renovated [3m[38;5;246m<lgl>[39m[23m FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
Have a think about how to treat waterfront. Should we convert its type?
We converted yr_renovated into a renovated logical variable, indicating whether the property had ever been renovated. You may wish to do the same.
Have a think about how to treat condition and grade? Are they interval or categorical ordinal data types?
Check for aliased variables using the alias() function (this takes in a formula object and a data set). [Hint - formula price ~ . says ‘price varying with all predictors’, this is a suitable input to alias()]. Remove variables that lead to an alias. Check the ‘Elements of multiple regression’ lesson for a dropdown containing further information on finding aliased variables in a dataset.
library(modelr)
package ‘modelr’ was built under R version 3.6.2
alias(lm(price~ ., data = house_data_tidy))
Model :
price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
waterfront + view + condition + grade + sqft_above + sqft_basement +
yr_built + yr_renovated + lat + long
Complete :
(Intercept) bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition
sqft_basement 0 0 0 1 0 0 0 0 0
grade sqft_above yr_built yr_renovated lat long
sqft_basement 0 -1 0 0 0 0
house_price_tidy <- house_data_tidy %>%
select(-c("sqft_living", "sqft_above"))
CODECLAN SOLUTION
# Alias is useful to check if we have aliased variables, i.e. one or more
# variables that can be computed from other variables
alias(price ~ ., data = houses_tidy)
Model :
price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
waterfront + view + condition + grade + sqft_above + sqft_basement +
yr_built + lat + long + renovated
Complete :
(Intercept) bedrooms bathrooms sqft_living sqft_lot floors waterfrontTRUE view
sqft_basement 0 0 0 1 0 0 0 0
condition2 condition3 condition4 condition5 grade3 grade4 grade5 grade6 grade7
sqft_basement 0 0 0 0 0 0 0 0 0
grade8 grade9 grade10 grade11 grade12 grade13 sqft_above yr_built lat long
sqft_basement 0 0 0 0 0 0 -1 0 0 0
renovatedTRUE
sqft_basement 0
# seems that sqft_basement can be computed from sqft_living - sqft_above.
# let's drop sqft_living leaving just the two contributions sqft_basement and
# sqft_above
houses_tidy <- houses_tidy %>%
select(-c("sqft_living"))
Error: Can't subset columns that don't exist.
[31mx[39m Column `sqft_living` doesn't exist.
Systematically build a regression model containing up to four main effects (remember, a main effect is just a single predictor with coefficient), testing the regression diagnostics as you go
splitting datasets into numeric and non-numeric columns might help ggpairs() run in manageable time, although you will need to add either a price or resid column to the non-numeric dataframe in order to see its correlations with the non-numeric predictors.
library(GGally)
package ‘GGally’ was built under R version 3.6.2Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
houses_tidy_numeric <- house_price_tidy %>%
select_if(is.numeric)
houses_tidy_nonnumeric <- house_price_tidy %>%
select_if(function(x) !is.numeric(x))
houses_tidy_nonnumeric$price <- house_price_tidy$price
ggpairs(houses_tidy_numeric)
ggpairs(houses_tidy_nonnumeric)
mod1a <- lm(price ~ grade, data = houses_tidy_numeric)
mod1a
Call:
lm(formula = price ~ grade, data = houses_tidy_numeric)
Coefficients:
(Intercept) grade
-1056045 208458
summary(mod1a)
Call:
lm(formula = price ~ grade, data = houses_tidy_numeric)
Residuals:
Min 1Q Median 3Q Max
-816988 -151958 -36158 97842 6046097
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1056045 12256 -86.17 <2e-16 ***
grade 208458 1582 131.76 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 273400 on 21611 degrees of freedom
Multiple R-squared: 0.4455, Adjusted R-squared: 0.4454
F-statistic: 1.736e+04 on 1 and 21611 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(mod1a)
mod2a <- lm(price ~ condition, data = house_data_tidy)
mod2a
Call:
lm(formula = price ~ condition, data = house_data_tidy)
Coefficients:
(Intercept) condition
470147 20514
summary(mod2a)
Call:
lm(formula = price ~ condition, data = house_data_tidy)
Residuals:
Min 1Q Median 3Q Max
-463203 -217203 -87203 101797 7147797
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 470147 13312 35.318 < 2e-16 ***
condition 20514 3835 5.349 8.94e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 366900 on 21611 degrees of freedom
Multiple R-squared: 0.001322, Adjusted R-squared: 0.001276
F-statistic: 28.61 on 1 and 21611 DF, p-value: 8.936e-08
house_remianing_resid <- houses_tidy_numeric %>%
add_residuals(mod1a) %>%
select(-c("price", "grade"))
house_remianing_resid %>%
ggpairs(aes(colour = condition, alpha = 0.5))
Error in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
`mapping` color column must be categorical, not numeric
par(mfrow = c(2,2))
plot(mod2a)
anova(mod1a, mod2a)
Analysis of Variance Table
Model 1: price ~ grade
Model 2: price ~ condition
Res.Df RSS Df Sum of Sq F Pr(>F)
1 21611 1.6153e+15
2 21611 2.9091e+15 0 -1.2938e+15
mod1b <- lm(price ~ long, data = houses_tidy_numeric)
mod1b
Call:
lm(formula = price ~ long, data = houses_tidy_numeric)
Coefficients:
(Intercept) long
7430229 56378
summary(mod1b)
Call:
lm(formula = price ~ long, data = houses_tidy_numeric)
Residuals:
Min 1Q Median 3Q Max
-482447 -218209 -88758 102766 7166063
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7430229 2166748 3.429 0.000606 ***
long 56378 17729 3.180 0.001475 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 367000 on 21611 degrees of freedom
Multiple R-squared: 0.0004677, Adjusted R-squared: 0.0004214
F-statistic: 10.11 on 1 and 21611 DF, p-value: 0.001475
CODECLAN SOLUTION
houses_tidy_numeric <- houses_tidy %>%
select_if(is.numeric)
houses_tidy_nonnumeric <- houses_tidy %>%
select_if(function(x) !is.numeric(x))
houses_tidy_nonnumeric$price <- houses_tidy$price
ggpairs(houses_tidy_numeric)
ggpairs(houses_tidy_nonnumeric)
Correlation of sqft_above with price looks pretty promising, but split of price by grade and waterfront also look decent.
houses_tidy %>%
ggplot(aes(x = grade, y = price)) +
geom_boxplot()
houses_tidy %>%
ggplot(aes(x = waterfront, y = price)) +
geom_boxplot()
mod1_a <- lm(price ~ sqft_above, data = houses_tidy)
summary(mod1_a)
Call:
lm(formula = price ~ sqft_above, data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-913132 -165624 -41468 109327 5339232
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 59953.2 4729.8 12.68 <2e-16 ***
sqft_above 268.5 2.4 111.87 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 292200 on 21611 degrees of freedom
Multiple R-squared: 0.3667, Adjusted R-squared: 0.3667
F-statistic: 1.251e+04 on 1 and 21611 DF, p-value: < 2.2e-16
mod1_b <- lm(price ~ grade, data = houses_tidy)
summary(mod1_b)
Call:
lm(formula = price ~ grade, data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1929615 -135853 -35090 89080 5565658
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 142000 254499 0.558 0.576878
grade3 63667 293870 0.217 0.828484
grade4 72381 258849 0.280 0.779767
grade5 106524 255024 0.418 0.676169
grade6 159920 254561 0.628 0.529868
grade7 260590 254513 1.024 0.305904
grade8 400853 254520 1.575 0.115285
grade9 631513 254547 2.481 0.013112 *
grade10 929771 254611 3.652 0.000261 ***
grade11 1354842 254817 5.317 1.07e-07 ***
grade12 2049222 255909 8.008 1.23e-15 ***
grade13 3567615 264106 13.508 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 254500 on 21601 degrees of freedom
Multiple R-squared: 0.5197, Adjusted R-squared: 0.5195
F-statistic: 2125 on 11 and 21601 DF, p-value: < 2.2e-16
mod1_c <- lm(price ~ waterfront, data = houses_tidy)
summary(mod1_c)
Call:
lm(formula = price ~ waterfront, data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1376876 -211564 -81564 108436 7168436
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 531564 2416 220.00 <2e-16 ***
waterfrontTRUE 1130312 27822 40.63 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 353900 on 21611 degrees of freedom
Multiple R-squared: 0.07095, Adjusted R-squared: 0.07091
F-statistic: 1650 on 1 and 21611 DF, p-value: < 2.2e-16
# grade looks the most promising, but some of the grade level coeffs are insignificant.
# the F-test at the bottom of the regression output tests against the null model (i.e. intercept only)
# but, if we want, we can replicate this using a separate anova
# null model: regress price on intercept only
null_model <- lm(price ~ 1, data = houses_tidy)
grade_model <- lm(price ~ grade, data = houses_tidy)
anova(null_model, grade_model)
Analysis of Variance Table
Model 1: price ~ 1
Model 2: price ~ grade
Res.Df RSS Df Sum of Sq F Pr(>F)
1 21612 2.9129e+15
2 21601 1.3991e+15 11 1.5138e+15 2124.8 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# grade is significant, let's keep it. Now plot diagnostics
par(mfrow = c(2, 2))
plot(mod1_b)
not plotting observations with leverage one:
19453
houses_resid <- houses_tidy %>%
add_residuals(mod1b) %>%
select(-c("price", "grade"))
houses_resid_numeric <- houses_resid %>%
select_if(is.numeric)
houses_resid_nonnumeric <- houses_resid %>%
select_if(function(x) !is.numeric(x))
houses_resid_nonnumeric$resid <- houses_resid$resid
ggpairs(houses_resid_numeric)
ggpairs(houses_resid_nonnumeric)
lat has highest correlation with residuals, but, again, waterfront still looks pretty promising. Try both…
mod2_a <- lm(price ~ grade + lat, data = houses_tidy)
summary(mod2_a)
Call:
lm(formula = price ~ grade + lat, data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1863783 -112379 -28181 67454 5533910
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -29949769 610666 -49.044 < 2e-16 ***
grade3 187923 276126 0.681 0.496151
grade4 95367 243212 0.392 0.694978
grade5 126354 239618 0.527 0.597980
grade6 158453 239183 0.662 0.507673
grade7 246275 239138 1.030 0.303093
grade8 378635 239144 1.583 0.113369
grade9 603010 239170 2.521 0.011701 *
grade10 891752 239230 3.728 0.000194 ***
grade11 1311124 239425 5.476 4.4e-08 ***
grade12 2007093 240450 8.347 < 2e-16 ***
grade13 3502099 248154 14.113 < 2e-16 ***
lat 633100 11822 53.554 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 239100 on 21600 degrees of freedom
Multiple R-squared: 0.576, Adjusted R-squared: 0.5758
F-statistic: 2445 on 12 and 21600 DF, p-value: < 2.2e-16
mod2_b <- lm(price ~ grade + waterfront, data = houses_tidy)
summary(mod2_b)
Call:
lm(formula = price ~ grade + waterfront, data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1929615 -132969 -32393 91481 4778940
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 142000 244366 0.581 0.5612
grade3 63667 282169 0.226 0.8215
grade4 72381 248543 0.291 0.7709
grade5 92834 244870 0.379 0.7046
grade6 155043 244426 0.634 0.5259
grade7 258469 244379 1.058 0.2902
grade8 395393 244386 1.618 0.1057
grade9 623595 244413 2.551 0.0107 *
grade10 909321 244474 3.719 0.0002 ***
grade11 1313326 244674 5.368 8.06e-08 ***
grade12 1947993 245731 7.927 2.35e-15 ***
grade13 3567615 253590 14.068 < 2e-16 ***
waterfrontTRUE 828234 19363 42.773 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 244400 on 21600 degrees of freedom
Multiple R-squared: 0.5572, Adjusted R-squared: 0.557
F-statistic: 2265 on 12 and 21600 DF, p-value: < 2.2e-16
# lat is significant and higher r^2, let's keep model2a
par(mfrow = c(2, 2))
plot(mod2_a)
not plotting observations with leverage one:
19453
houses_resid <- houses_tidy %>%
add_residuals(mod2_a) %>%
select(-c("price", "grade", "lat"))
houses_resid_numeric <- houses_resid %>%
select_if(is.numeric)
houses_resid_nonnumeric <- houses_resid %>%
select_if(function(x) !is.numeric(x))
houses_resid_nonnumeric$resid <- houses_resid$resid
ggpairs(houses_resid_numeric)
ggpairs(houses_resid_nonnumeric)
Now view has strongest correlation with residuals, but also compare against model with waterfront.
mod3_a <- lm(price ~ grade + lat + view, data = houses_tidy)
summary(mod3_a)
Call:
lm(formula = price ~ grade + lat + view, data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1665265 -105866 -21393 69623 5429667
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -30514024 576750 -52.907 < 2e-16 ***
grade3 190253 260743 0.730 0.465607
grade4 81058 229663 0.353 0.724133
grade5 112154 226269 0.496 0.620134
grade6 148515 225858 0.658 0.510827
grade7 235345 225815 1.042 0.297328
grade8 351873 225822 1.558 0.119203
grade9 556584 225848 2.464 0.013731 *
grade10 821118 225907 3.635 0.000279 ***
grade11 1200228 226097 5.308 1.12e-07 ***
grade12 1832950 227080 8.072 7.28e-16 ***
grade13 3303587 234361 14.096 < 2e-16 ***
lat 644972 11166 57.764 < 2e-16 ***
view 106862 2086 51.234 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 225800 on 21599 degrees of freedom
Multiple R-squared: 0.6219, Adjusted R-squared: 0.6217
F-statistic: 2733 on 13 and 21599 DF, p-value: < 2.2e-16
mod3_b <- lm(price ~ grade + lat + waterfront, data = houses_tidy)
summary(mod3_b)
Call:
lm(formula = price ~ grade + lat + waterfront, data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1862555 -109244 -24781 70145 4724767
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -30511083 581588 -52.462 < 2e-16 ***
grade3 190241 262923 0.724 0.469343
grade4 95796 231583 0.414 0.679130
grade5 112654 228161 0.494 0.621488
grade6 153414 227746 0.674 0.500562
grade7 243828 227703 1.071 0.284264
grade8 372609 227709 1.636 0.101783
grade9 594340 227734 2.610 0.009066 **
grade10 870026 227792 3.819 0.000134 ***
grade11 1267641 227979 5.560 2.72e-08 ***
grade12 1902270 228964 8.308 < 2e-16 ***
grade13 3500877 236288 14.816 < 2e-16 ***
lat 644910 11259 57.278 < 2e-16 ***
waterfrontTRUE 851219 18046 47.168 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 227700 on 21599 degrees of freedom
Multiple R-squared: 0.6156, Adjusted R-squared: 0.6154
F-statistic: 2661 on 13 and 21599 DF, p-value: < 2.2e-16
# view model is best, keep mod3a
par(mfrow = c(2, 2))
plot(mod3_a)
not plotting observations with leverage one:
19453
houses_resid <- houses_tidy %>%
add_residuals(mod3_a) %>%
select(-c("price", "grade", "lat", "view"))
houses_resid_numeric <- houses_resid %>%
select_if(is.numeric)
houses_resid_nonnumeric <- houses_resid %>%
select_if(function(x) !is.numeric(x))
houses_resid_nonnumeric$resid <- houses_resid$resid
ggpairs(houses_resid_numeric)
ggpairs(houses_resid_nonnumeric)
sqft_basement has highest correlation with residuals. Let’s test against all remaining categorical predictors:
mod4_a <- lm(price ~ grade + lat + view + sqft_basement, data = houses_tidy)
summary(mod4_a)
Call:
lm(formula = price ~ grade + lat + view + sqft_basement, data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1569582 -105308 -18663 70867 5230039
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.875e+07 5.647e+05 -50.913 < 2e-16 ***
grade3 1.830e+05 2.542e+05 0.720 0.47164
grade4 8.134e+04 2.239e+05 0.363 0.71638
grade5 1.088e+05 2.206e+05 0.493 0.62197
grade6 1.355e+05 2.202e+05 0.615 0.53841
grade7 2.042e+05 2.201e+05 0.927 0.35373
grade8 3.194e+05 2.201e+05 1.451 0.14687
grade9 5.281e+05 2.202e+05 2.399 0.01646 *
grade10 7.859e+05 2.202e+05 3.568 0.00036 ***
grade11 1.156e+06 2.204e+05 5.244 1.59e-07 ***
grade12 1.765e+06 2.214e+05 7.974 1.61e-15 ***
grade13 3.169e+06 2.285e+05 13.869 < 2e-16 ***
lat 6.079e+05 1.094e+04 55.559 < 2e-16 ***
view 8.903e+04 2.101e+03 42.367 < 2e-16 ***
sqft_basement 1.204e+02 3.582e+00 33.600 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 220100 on 21598 degrees of freedom
Multiple R-squared: 0.6407, Adjusted R-squared: 0.6405
F-statistic: 2751 on 14 and 21598 DF, p-value: < 2.2e-16
mod4_b <- lm(price ~ grade + lat + view + waterfront, data = houses_tidy)
summary(mod4_b)
Call:
lm(formula = price ~ grade + lat + view + waterfront, data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1713543 -105538 -20871 69842 4901827
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -30758475 564698 -54.469 < 2e-16 ***
grade3 191262 255269 0.749 0.453709
grade4 84892 224841 0.378 0.705758
grade5 106294 221518 0.480 0.631345
grade6 147526 221116 0.667 0.504659
grade7 236375 221074 1.069 0.284985
grade8 354372 221081 1.603 0.108970
grade9 562139 221106 2.542 0.011016 *
grade10 823728 221164 3.725 0.000196 ***
grade11 1197915 221350 5.412 6.30e-08 ***
grade12 1804315 222314 8.116 5.07e-16 ***
grade13 3351868 229446 14.609 < 2e-16 ***
lat 650115 10932 59.466 < 2e-16 ***
view 80422 2217 36.273 < 2e-16 ***
waterfrontTRUE 582422 19024 30.615 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 221100 on 21598 degrees of freedom
Multiple R-squared: 0.6377, Adjusted R-squared: 0.6374
F-statistic: 2715 on 14 and 21598 DF, p-value: < 2.2e-16
mod4_c <- lm(price ~ grade + lat + view + condition, data = houses_tidy)
summary(mod4_c)
Call:
lm(formula = price ~ grade + lat + view + condition, data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1665132 -105472 -18392 71578 5447648
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.043e+07 5.673e+05 -53.640 < 2e-16 ***
grade3 1.472e+05 2.595e+05 0.567 0.570486
grade4 6.481e+04 2.291e+05 0.283 0.777293
grade5 7.656e+04 2.258e+05 0.339 0.734605
grade6 1.165e+05 2.257e+05 0.516 0.605628
grade7 2.090e+05 2.257e+05 0.926 0.354406
grade8 3.359e+05 2.257e+05 1.488 0.136720
grade9 5.476e+05 2.257e+05 2.426 0.015270 *
grade10 8.154e+05 2.258e+05 3.612 0.000305 ***
grade11 1.199e+06 2.260e+05 5.307 1.12e-07 ***
grade12 1.834e+06 2.269e+05 8.082 6.69e-16 ***
grade13 3.310e+06 2.339e+05 14.148 < 2e-16 ***
lat 6.432e+05 1.098e+04 58.552 < 2e-16 ***
view 1.017e+05 2.058e+03 49.425 < 2e-16 ***
condition2 -4.275e+02 4.466e+04 -0.010 0.992363
condition3 -6.453e+03 4.154e+04 -0.155 0.876556
condition4 5.758e+04 4.158e+04 1.385 0.166054
condition5 1.349e+05 4.180e+04 3.226 0.001258 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 221800 on 21595 degrees of freedom
Multiple R-squared: 0.6352, Adjusted R-squared: 0.635
F-statistic: 2212 on 17 and 21595 DF, p-value: < 2.2e-16
mod4_d <- lm(price ~ grade + lat + view + renovated, data = houses_tidy)
summary(mod4_d)
Call:
lm(formula = price ~ grade + lat + view + renovated, data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1637628 -105101 -19410 70577 5280904
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -30179084 571198 -52.835 < 2e-16 ***
grade3 188870 258132 0.732 0.464372
grade4 75932 227363 0.334 0.738407
grade5 108602 224003 0.485 0.627808
grade6 141362 223596 0.632 0.527250
grade7 229992 223554 1.029 0.303586
grade8 346281 223561 1.549 0.121413
grade9 550447 223587 2.462 0.013828 *
grade10 817501 223645 3.655 0.000257 ***
grade11 1199432 223833 5.359 8.47e-08 ***
grade12 1835525 224806 8.165 3.39e-16 ***
grade13 3275946 232018 14.119 < 2e-16 ***
lat 637925 11059 57.684 < 2e-16 ***
view 102285 2076 49.261 < 2e-16 ***
renovatedTRUE 159554 7606 20.979 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 223500 on 21598 degrees of freedom
Multiple R-squared: 0.6295, Adjusted R-squared: 0.6292
F-statistic: 2621 on 14 and 21598 DF, p-value: < 2.2e-16
# looks like model with sqft_basement is best, keep mod4a
par(mfrow = c(2, 2))
plot(mod4_a)
not plotting observations with leverage one:
19453
houses_resid <- houses_tidy %>%
add_residuals(mod4_a) %>%
select(- price)
Our final model in terms of main effects is: price ~ grade + lat + view + sqft_basement
EXTENSION
Consider possible interactions between your four main effect predictors and test their effect upon r2. Choose your best candidate interaction and visualise its effect.
Calculate the relative importance of predictors from your best 4-predictor model (i.e. the model without an interaction). Which predictor affects price most strongly?
Now, for interactions, have six possibilities that obey principle of strong hierarchy (i.e. consider including an interaction only if its main effects are already present in the model)
mod5_a <- lm(price ~ grade + lat + view + sqft_basement + grade:lat, data = houses_tidy)
summary(mod5_a)
Call:
lm(formula = price ~ grade + lat + view + sqft_basement + grade:lat,
data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1496592 -101302 -20837 67958 5196329
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.375e+07 4.424e+07 -0.989 0.3228
grade3 3.068e+07 6.120e+07 0.501 0.6162
grade4 3.760e+07 4.798e+07 0.784 0.4332
grade5 2.917e+07 4.457e+07 0.654 0.5129
grade6 2.420e+07 4.428e+07 0.546 0.5848
grade7 1.912e+07 4.425e+07 0.432 0.6656
grade8 1.546e+07 4.425e+07 0.349 0.7267
grade9 -7.530e+05 4.427e+07 -0.017 0.9864
grade10 -1.492e+07 4.435e+07 -0.336 0.7366
grade11 -1.469e+07 4.466e+07 -0.329 0.7422
grade12 -8.399e+07 4.632e+07 -1.813 0.0698 .
grade13 3.135e+06 2.463e+05 12.728 <2e-16 ***
lat 9.234e+05 9.308e+05 0.992 0.3212
view 8.949e+04 2.086e+03 42.892 <2e-16 ***
sqft_basement 1.208e+02 3.556e+00 33.968 <2e-16 ***
grade3:lat -6.429e+05 1.290e+06 -0.498 0.6183
grade4:lat -7.898e+05 1.010e+06 -0.782 0.4340
grade5:lat -6.115e+05 9.378e+05 -0.652 0.5143
grade6:lat -5.062e+05 9.315e+05 -0.543 0.5869
grade7:lat -3.980e+05 9.310e+05 -0.428 0.6690
grade8:lat -3.186e+05 9.310e+05 -0.342 0.7322
grade9:lat 2.662e+04 9.315e+05 0.029 0.9772
grade10:lat 3.296e+05 9.330e+05 0.353 0.7239
grade11:lat 3.325e+05 9.395e+05 0.354 0.7234
grade12:lat 1.801e+06 9.745e+05 1.848 0.0646 .
grade13:lat NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 218400 on 21588 degrees of freedom
Multiple R-squared: 0.6464, Adjusted R-squared: 0.646
F-statistic: 1644 on 24 and 21588 DF, p-value: < 2.2e-16
mod5_b <- lm(price ~ grade + lat + view + sqft_basement + grade:view, data = houses_tidy)
summary(mod5_b)
Call:
lm(formula = price ~ grade + lat + view + sqft_basement + grade:view,
data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1878838 -104533 -20264 69483 5165978
Coefficients: (2 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.871e+07 5.606e+05 -51.208 < 2e-16 ***
grade3 1.828e+05 2.523e+05 0.725 0.468734
grade4 8.617e+04 2.226e+05 0.387 0.698645
grade5 1.122e+05 2.189e+05 0.513 0.608291
grade6 1.379e+05 2.185e+05 0.631 0.528067
grade7 2.081e+05 2.185e+05 0.952 0.340866
grade8 3.222e+05 2.185e+05 1.475 0.140313
grade9 5.373e+05 2.185e+05 2.459 0.013953 *
grade10 7.518e+05 2.186e+05 3.439 0.000585 ***
grade11 1.084e+06 2.189e+05 4.953 7.35e-07 ***
grade12 1.741e+06 2.208e+05 7.887 3.23e-15 ***
grade13 2.594e+06 2.372e+05 10.939 < 2e-16 ***
lat 6.070e+05 1.086e+04 55.879 < 2e-16 ***
view 4.024e+05 3.764e+04 10.689 < 2e-16 ***
sqft_basement 1.178e+02 3.570e+00 32.987 < 2e-16 ***
grade3:view NA NA NA NA
grade4:view -3.485e+05 1.009e+05 -3.454 0.000553 ***
grade5:view -3.381e+05 4.384e+04 -7.712 1.29e-14 ***
grade6:view -3.360e+05 3.893e+04 -8.631 < 2e-16 ***
grade7:view -3.454e+05 3.794e+04 -9.104 < 2e-16 ***
grade8:view -3.214e+05 3.782e+04 -8.498 < 2e-16 ***
grade9:view -3.326e+05 3.788e+04 -8.782 < 2e-16 ***
grade10:view -2.595e+05 3.800e+04 -6.828 8.81e-12 ***
grade11:view -2.424e+05 3.843e+04 -6.307 2.89e-10 ***
grade12:view -2.972e+05 4.002e+04 -7.428 1.14e-13 ***
grade13:view NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 218500 on 21589 degrees of freedom
Multiple R-squared: 0.6462, Adjusted R-squared: 0.6459
F-statistic: 1715 on 23 and 21589 DF, p-value: < 2.2e-16
mod5_c <- lm(price ~ grade + lat + view + sqft_basement + grade:sqft_basement, data = houses_tidy)
summary(mod5_c)
Call:
lm(formula = price ~ grade + lat + view + sqft_basement + grade:sqft_basement,
data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-2093544 -102714 -19903 68417 4922552
Coefficients: (2 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.922e+07 5.576e+05 -52.398 < 2e-16 ***
grade3 1.849e+05 2.507e+05 0.737 0.460910
grade4 7.737e+04 2.210e+05 0.350 0.726274
grade5 1.098e+05 2.176e+05 0.504 0.613935
grade6 1.427e+05 2.172e+05 0.657 0.511185
grade7 2.183e+05 2.172e+05 1.005 0.314862
grade8 3.305e+05 2.172e+05 1.522 0.128016
grade9 5.195e+05 2.172e+05 2.392 0.016782 *
grade10 7.595e+05 2.173e+05 3.495 0.000475 ***
grade11 1.067e+06 2.176e+05 4.906 9.38e-07 ***
grade12 1.603e+06 2.191e+05 7.318 2.61e-13 ***
grade13 2.047e+06 2.372e+05 8.629 < 2e-16 ***
lat 6.177e+05 1.080e+04 57.166 < 2e-16 ***
view 8.401e+04 2.100e+03 40.011 < 2e-16 ***
sqft_basement 9.153e+02 5.204e+01 17.589 < 2e-16 ***
grade3:sqft_basement NA NA NA NA
grade4:sqft_basement -6.667e+01 1.106e+03 -0.060 0.951943
grade5:sqft_basement -7.957e+02 1.065e+02 -7.470 8.33e-14 ***
grade6:sqft_basement -8.503e+02 5.511e+01 -15.429 < 2e-16 ***
grade7:sqft_basement -8.442e+02 5.238e+01 -16.118 < 2e-16 ***
grade8:sqft_basement -8.273e+02 5.241e+01 -15.785 < 2e-16 ***
grade9:sqft_basement -7.618e+02 5.272e+01 -14.451 < 2e-16 ***
grade10:sqft_basement -7.237e+02 5.317e+01 -13.611 < 2e-16 ***
grade11:sqft_basement -6.234e+02 5.421e+01 -11.499 < 2e-16 ***
grade12:sqft_basement -5.887e+02 5.646e+01 -10.427 < 2e-16 ***
grade13:sqft_basement NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 217100 on 21589 degrees of freedom
Multiple R-squared: 0.6505, Adjusted R-squared: 0.6502
F-statistic: 1747 on 23 and 21589 DF, p-value: < 2.2e-16
mod5_d <- lm(price ~ grade + lat + view + sqft_basement + lat:view, data = houses_tidy)
summary(mod5_d)
Call:
lm(formula = price ~ grade + lat + view + sqft_basement + lat:view,
data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1544431 -103361 -20451 68950 5201833
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.680e+07 5.780e+05 -46.362 < 2e-16 ***
grade3 1.749e+05 2.530e+05 0.691 0.489328
grade4 7.915e+04 2.228e+05 0.355 0.722429
grade5 1.100e+05 2.195e+05 0.501 0.616167
grade6 1.372e+05 2.191e+05 0.626 0.531099
grade7 2.062e+05 2.191e+05 0.941 0.346529
grade8 3.209e+05 2.191e+05 1.464 0.143085
grade9 5.296e+05 2.191e+05 2.417 0.015660 *
grade10 7.858e+05 2.192e+05 3.585 0.000338 ***
grade11 1.152e+06 2.194e+05 5.251 1.53e-07 ***
grade12 1.757e+06 2.203e+05 7.974 1.61e-15 ***
grade13 3.133e+06 2.274e+05 13.776 < 2e-16 ***
lat 5.668e+05 1.125e+04 50.366 < 2e-16 ***
view -1.069e+07 7.448e+05 -14.352 < 2e-16 ***
sqft_basement 1.188e+02 3.567e+00 33.320 < 2e-16 ***
lat:view 2.266e+05 1.566e+04 14.471 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 219100 on 21597 degrees of freedom
Multiple R-squared: 0.6442, Adjusted R-squared: 0.6439
F-statistic: 2607 on 15 and 21597 DF, p-value: < 2.2e-16
mod5_e <- lm(price ~ grade + lat + view + sqft_basement + lat:sqft_basement, data = houses_tidy)
summary(mod5_e)
Call:
lm(formula = price ~ grade + lat + view + sqft_basement + lat:sqft_basement,
data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1550147 -101737 -18763 68303 5174888
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.488e+07 6.359e+05 -39.124 < 2e-16 ***
grade3 1.670e+05 2.532e+05 0.659 0.509600
grade4 7.862e+04 2.230e+05 0.353 0.724442
grade5 1.072e+05 2.197e+05 0.488 0.625492
grade6 1.362e+05 2.193e+05 0.621 0.534705
grade7 2.060e+05 2.193e+05 0.939 0.347638
grade8 3.210e+05 2.193e+05 1.464 0.143228
grade9 5.299e+05 2.193e+05 2.416 0.015700 *
grade10 7.885e+05 2.194e+05 3.594 0.000326 ***
grade11 1.158e+06 2.196e+05 5.274 1.35e-07 ***
grade12 1.765e+06 2.205e+05 8.004 1.26e-15 ***
grade13 3.143e+06 2.276e+05 13.809 < 2e-16 ***
lat 5.264e+05 1.256e+04 41.918 < 2e-16 ***
view 8.911e+04 2.093e+03 42.569 < 2e-16 ***
sqft_basement -1.698e+04 1.310e+03 -12.968 < 2e-16 ***
lat:sqft_basement 3.595e+02 2.752e+01 13.060 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 219300 on 21597 degrees of freedom
Multiple R-squared: 0.6435, Adjusted R-squared: 0.6433
F-statistic: 2599 on 15 and 21597 DF, p-value: < 2.2e-16
mod5_f <- lm(price ~ grade + lat + view + sqft_basement + view:sqft_basement, data = houses_tidy)
summary(mod5_f)
Call:
lm(formula = price ~ grade + lat + view + sqft_basement + view:sqft_basement,
data = houses_tidy)
Residuals:
Min 1Q Median 3Q Max
-1638784 -104221 -19570 69709 5166209
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.883e+07 5.636e+05 -51.155 < 2e-16 ***
grade3 1.833e+05 2.537e+05 0.723 0.469931
grade4 8.399e+04 2.234e+05 0.376 0.707022
grade5 1.118e+05 2.201e+05 0.508 0.611432
grade6 1.385e+05 2.197e+05 0.630 0.528579
grade7 2.087e+05 2.197e+05 0.950 0.342139
grade8 3.238e+05 2.197e+05 1.474 0.140599
grade9 5.312e+05 2.197e+05 2.417 0.015645 *
grade10 7.884e+05 2.198e+05 3.587 0.000335 ***
grade11 1.155e+06 2.200e+05 5.250 1.54e-07 ***
grade12 1.742e+06 2.210e+05 7.886 3.27e-15 ***
grade13 3.138e+06 2.281e+05 13.756 < 2e-16 ***
lat 6.096e+05 1.092e+04 55.822 < 2e-16 ***
view 7.106e+04 2.847e+03 24.962 < 2e-16 ***
sqft_basement 1.056e+02 3.910e+00 27.003 < 2e-16 ***
view:sqft_basement 2.870e+01 3.073e+00 9.339 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 219700 on 21597 degrees of freedom
Multiple R-squared: 0.6422, Adjusted R-squared: 0.6419
F-statistic: 2584 on 15 and 21597 DF, p-value: < 2.2e-16
# mod5c looks like the best
par(mfrow = c(2,2))
plot(mod5_c)
not plotting observations with leverage one:
8620, 19453
It seems that the grade:sqft_basement interaction leads to highest r2 (but two levels of the interaction cannot be determined due to fitting problems).
Now let’s see a visualisation of the effect of this interaction.
houses_resid %>%
ggplot(aes(x = sqft_basement, y = resid, colour = grade)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE) +
facet_wrap(~ grade)
Relative importance of predictors:
library(relaimpo)
Loading required package: MASS
package ‘MASS’ was built under R version 3.6.2
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
Loading required package: boot
Loading required package: survey
package ‘survey’ was built under R version 3.6.2Loading required package: grid
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
Loading required package: survival
Attaching package: ‘survival’
The following object is masked from ‘package:boot’:
aml
Attaching package: ‘survey’
The following object is masked from ‘package:graphics’:
dotchart
Loading required package: mitools
This is the global version of package relaimpo.
If you are a non-US user, a version with the interesting additional metric pmvd is available
from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
calc.relimp(mod4_a, method = "lmg", rela = TRUE)
Response variable: price
Total response variance: 134782378397
Analysis based on 21613 observations
14 Regressors:
Some regressors combined in groups:
Group grade : grade3 grade4 grade5 grade6 grade7 grade8 grade9 grade10 grade11 grade12 grade13
Relative importance of 4 (groups of) regressors assessed:
grade lat view sqft_basement
Proportion of variance explained by model: 64.07%
Metrics are normalized to sum to 100% (rela=TRUE).
Relative importance metrics:
lmg
grade 0.67013680
lat 0.10999806
view 0.13594911
sqft_basement 0.08391603
Average coefficients for different model sizes:
1group 2groups 3groups 4groups
grade3 63666.6672 105085.5034 144251.4794 182970.8245
grade4 72381.0351 74840.3850 77838.0192 81340.5229
grade5 106523.9716 106120.0182 106860.8796 108762.8919
grade6 159919.6380 148997.5902 140821.1511 135460.9340
grade7 260590.2629 235879.0272 216962.5200 204157.6486
grade8 400852.7662 366263.3497 339139.6632 319376.1318
grade9 631513.1864 588676.4480 554462.1758 528149.5236
grade10 929771.0746 870400.5336 822843.4234 785879.9257
grade11 1354841.7274 1272639.8932 1206980.5470 1155890.1732
grade12 2049222.0006 1930531.4478 1836851.8205 1765372.6480
grade13 3567615.3852 3398156.8184 3266023.2283 3169155.3827
lat 813411.5832 722508.3996 660525.2637 607867.6492
view 190335.2479 151137.4701 117930.1234 89031.6774
sqft_basement 268.6136 203.8056 154.5114 120.3599
It looks like the grade of property is the most important determiner of price, followed by the number of views the property has received.